import os,re,json
import jieba as jb

# File extensions (text after the last '.', case-sensitive) that cutAll accepts.
_ALLOWED_EXTENSIONS = ('txt', 'xhtml', 'html')

# Encodings tried, in order, when reading a file.
_CANDIDATE_ENCODINGS = ('utf8', 'gbk')

# Matches everything from '<' up to the next '>'.  The previous pattern
# excluded '/' inside the tag, so tags whose attributes contain a slash
# (e.g. <a href="http://...">, <link type="text/css"/>) were left in the text.
_HTML_TAG_RE = re.compile(r'<[^>]+>')


def cutAll(path):
    """Read a text/HTML file and return its content segmented by jieba.

    Args:
        path: Path of the file to tokenize (a ``str``).  Only files whose
            text after the last ``'.'`` is ``txt``, ``xhtml`` or ``html``
            are processed.

    Returns:
        The list produced by ``jb.lcut`` on the file's text after newlines
        and HTML tags have been removed.  An empty list is returned when
        the extension is not accepted, or when the file cannot be read as
        either UTF-8 or GBK (a ``warn: <path>`` line is printed in the
        latter case).
    """
    if path.rsplit('.', 1)[-1] not in _ALLOWED_EXTENSIONS:
        return []

    text = None
    for encoding in _CANDIDATE_ENCODINGS:
        try:
            with open(path, 'r', encoding=encoding) as f:
                text = f.read()
        except (OSError, UnicodeError):
            # Unreadable or not valid in this encoding: try the next one.
            # Deliberately narrower than a bare ``except`` so Ctrl-C and
            # SystemExit still propagate.
            continue
        break

    if text is None:
        print('warn: '+path)
        return []

    # Drop newlines first so text broken across lines is joined, then strip
    # HTML tags before segmentation.
    text = text.replace('\n', '')
    text = _HTML_TAG_RE.sub('', text)
    return jb.lcut(text)


# Build the corpus: tokenize every file under each sub-directory of `root`
# (except the one named 'stop') and write all tokens, concatenated into a
# single flat list, to a JSON file.
root = './bookself/'

# Only the first os.walk() entry is needed: it lists root's direct children.
# os.walk() yields nothing for a missing or unreadable directory, so report
# that explicitly instead of letting a bare StopIteration escape.
fs = next(os.walk(root), None)
if fs is None:
    raise FileNotFoundError('corpus root not found or not readable: ' + root)
subs = fs[1]

# Make sure the output directory exists before the tokenizing pass, so the
# run cannot fail at the final write after all the work has been done.
output_path = './data/AllCutted.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

raws = []
for sub in subs:
    if sub == 'stop':
        continue
    # Walk the sub-directory recursively and tokenize every file found;
    # cutAll() returns [] for files it does not accept.
    for fileRoot, _dirNames, fileNames in os.walk(os.path.join(root, sub)):
        for fileName in fileNames:
            raws += cutAll(os.path.join(fileRoot, fileName))

# ensure_ascii=False keeps non-ASCII tokens as readable UTF-8 text rather
# than \uXXXX escapes.
with open(output_path, 'w', encoding='utf8') as f:
    f.write(json.dumps(raws, ensure_ascii=False))
